In [97]:
from IPython.core.display import Image 
Image(filename='Mor Consulting CMYK.jpg')


Out[97]:

Measuring Similarity And Clustering Data

Bart Baddeley - PyData 2014

Notebook available on GitHub: https://github.com/BartBaddeley/PyData-2014.git

Why cluster data?

Introducing K-means clustering

Some tips on using K-means

Choosing between solutions and deciding on K

Spectral clustering

Why cluster data?

1. As a preprocessing step prior to some other data analysis technique
2. As an exploratory data analysis technique in its own right

Applications

Data summarisation
Recommendation Systems - Collaborative Filtering
Customer Segmentation
Document Clustering - Topic Modelling
Biological Data Analysis - Gene networks
Social Network Analysis

In [62]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['colors', 'eig']
`%pylab --no-import-all` prevents importing * from pylab and numpy

K-Means Clustering


In [98]:
import numpy as np
from sklearn.datasets import make_blobs, make_moons

# Generate sample data
np.random.seed(0)

centres = [[1, 1], [-0.5, 0], [1, -1]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=[[0.3,0.3]])

figure(figsize=(10, 10))
colors = ['r','b','g']
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)


Run K-means with K set to 3


In [64]:
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
##############################################################################
# Compute clustering with 3 Clusters

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)  
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1],c = 'k',marker='x', s=200)               
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)           
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)            
    scatter(centres[k][0], centres[k][1], marker ='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')


Out[64]:
<matplotlib.text.Text at 0x36005b70>

K-means may not work well with data having different scales in different dimensions


In [65]:
# Generate sample data
np.random.seed(0)

centres = [[1, 0.75], [1, -0.75], [0, 0]]

X0, labels0_true = make_blobs(n_samples=300, centers=centres[0], cluster_std=[[0.6,0.1]])
X1, labels1_true = make_blobs(n_samples=300, centers=centres[1], cluster_std=[[0.6,0.1]])
X2, labels2_true = make_blobs(n_samples=300, centers=centres[2], cluster_std=[[0.6,0.1]])

X = np.concatenate((X0,X1,X2))
labels_true = np.concatenate((labels0_true,labels1_true+1,labels2_true+2))


figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    cluster_center = centres[k]
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
axis('equal')


Out[65]:
(-2.0, 4.0, -1.5, 1.5)

In [66]:
##############################################################################
# Compute clustering with 3 Clusters

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)  
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1],c = 'k',marker='x', s=200)               
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)           
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)            
    scatter(centres[k][0], centres[k][1], marker ='o', c=col, s=200, alpha=0.8)
  
axis('equal')
title('KMeans 3')


Out[66]:
<matplotlib.text.Text at 0x362489e8>
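
One common remedy, not covered in the original notebook, is to standardise each feature to zero mean and unit variance before clustering. A minimal sketch using scikit-learn's StandardScaler follows; whether this recovers the intended clusters still depends on how informative each rescaled dimension is.

from sklearn.preprocessing import StandardScaler

# Rescale every column to zero mean and unit variance, then re-run K-means
X_scaled = StandardScaler().fit_transform(X)
k_means_scaled = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_scaled.fit(X_scaled)
k_means_scaled_labels = k_means_scaled.labels_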

We can cope with irrelevant dimensions so long as they do not vary too much


In [71]:
# Generate sample data
np.random.seed(0)

centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=[[0.4,0.4,0.4]])

colors = ['r','b','g']

figure(figsize=(20, 6.666))
subplot(1,3,1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')



In [72]:
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)  
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1],c = 'k',marker='x', s=200)               
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)           
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)            
    scatter(centres[k][0], centres[k][1], marker ='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')


Out[72]:
<matplotlib.text.Text at 0x36a15f98>

Performance can be good even when there are many irrelevant dimensions as long as the variation is not too great


In [73]:
# Generate sample data
np.random.seed(0)

centres = [[1, 1, 0, 0, 0, 0, 0, 0, 0], [-0.5, 0, 0, 0, 0, 0, 0, 0, 0], [1, -1, 0, 0, 0, 0, 0, 0, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=0.4)

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)  
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1],c = 'k',marker='x', s=200)               
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)           
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)            
    scatter(centres[k][0], centres[k][1], marker ='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')


Out[73]:
<matplotlib.text.Text at 0x36a2cda0>

But a large amount of variation in even a single irrelevant dimension can disrupt performance


In [74]:
# Generate sample data
np.random.seed(0)

centres = [[1, 1, 0], [-0.5, 0, 0], [1, -1, 0]]
X, labels_true = make_blobs(n_samples=1000, centers=centres, cluster_std=[[0.4,0.4,1.5]])

colors = ['r','b','g']

figure(figsize=(12, 4))
subplot(1,3,1)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,2)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 0], X[my_members, 2], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[0], cluster_center[2], c=col, marker='o', s=200)
    axis('equal')
    
subplot(1,3,3)
for k, col in zip(range(3), colors):
    my_members = labels_true == k
    scatter(X[my_members, 2], X[my_members, 1], c=col, marker='o',s=20) 
    cluster_center = centres[k]
    scatter(cluster_center[2], cluster_center[1], c=col, marker='o', s=200)
    axis('equal')



In [75]:
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centres = k_means_3.cluster_centers_

##############################################################################
# Plot result
distance = euclidean_distances(k_means_3_cluster_centres,
                               centres,
                               squared=True)
order = distance.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 10))
for k, col in zip(range(3), colors):
    my_wrong_members = (k_means_3_labels == order[k]) & (labels_true != k)  
    scatter(X[my_wrong_members, 0], X[my_wrong_members, 1],c = 'k',marker='x', s=200)               
    my_members = k_means_3_labels == order[k]
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)           
    cluster_center = k_means_3_cluster_centres[order[k]]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)            
    scatter(centres[k][0], centres[k][1], marker ='o', c=col, s=200, alpha=0.8)
             
title('KMeans 3')


Out[75]:
<matplotlib.text.Text at 0x36f20cf8>

K-means will not work well for non-isotropic data that is not well described by a centroid


In [112]:
[X, true_labels] = make_moons(n_samples=1000, noise=.05)

# Plot the raw two-moons data (the k-means result is overplotted below)
scatter(X[:, 0], X[:, 1], c='k', marker='o', s=20)
axis('equal') 
    
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_

for k, col in zip(range(2), colors):           
    my_members = k_means_2_labels == k
    scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)     
    cluster_center = k_means_2_cluster_centers[k]
    scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)


How can we choose between different clustering results?


In [76]:
# Generate sample data
np.random.seed(1)

centres = [[1, -1, 1.1, 1, 0, 2, 1], [1, -1, 1, 2, 0, 0, 0], [1, -1, 1.5, 0, 0, 0, 0]]

X, labels_true = make_blobs(n_samples=300, centers=centres, cluster_std = 0.4)
                            

k_means_3a = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3a.fit(X)
k_means_3a_labels = k_means_3a.labels_
k_means_3a_cluster_centres = k_means_3a.cluster_centers_

k_means_3b = KMeans(init='random', n_clusters=3, n_init=1, max_iter=1)
k_means_3b.fit(X)
k_means_3b_labels = k_means_3b.labels_
k_means_3b_cluster_centres = k_means_3b.cluster_centers_

distance_a = euclidean_distances(k_means_3a_cluster_centres,
                               centres,
                               squared=True)
order_a = distance_a.argmin(axis=0)

distance_b = euclidean_distances(k_means_3b_cluster_centres,
                               centres,
                               squared=True)
order_b = distance_b.argmin(axis=0)

# KMeans 3
figure(figsize=(10, 4))
wrong_a = 0
correct_a = 0
wrong_b = 0
correct_b = 0
for k in range(3):
    wrong_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true != k))                  
    correct_a += np.sum((k_means_3a_labels == order_a[k]) & (labels_true == k))
    wrong_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true != k))                  
    correct_b += np.sum((k_means_3b_labels == order_b[k]) & (labels_true == k))
    
subplot(1,2,1)
bar(range(2),[wrong_a, correct_a])
xticks([0.5,1.5],('Wrong','Correct'))
title('Run 1')
subplot(1,2,2)
bar(range(2),[wrong_b, correct_b])
xticks([0.5,1.5],('Wrong','Correct'))
title('Run 2')


Out[76]:
<matplotlib.text.Text at 0x365d7978>

The sum of squared errors (SSE), or "inertia", gives one way to choose between different clustering results
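
The inertia reported by KMeans is simply the sum of squared distances from each point to the centre of the cluster it was assigned to, so for a converged run it can be recomputed by hand as a sanity check. A quick sketch using the fitted model from the cell above:

# Sum of squared distances from each point to its assigned cluster centre;
# this should agree with k_means_3a.inertia_ up to floating-point error
sse_a = np.sum((X - k_means_3a_cluster_centres[k_means_3a_labels])**2)
sse_a, k_means_3a.inertia_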


In [77]:
bar(range(2),[k_means_3a.inertia_, k_means_3b.inertia_])
xticks([0.5,1.5],('Run 1','Run 2'))
title('Inertia')


Out[77]:
<matplotlib.text.Text at 0x36a1c898>

How can we choose K, the number of clusters?


In [78]:
# Generate sample data
np.random.seed(1)

centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)
X, labels_true = make_blobs(n_samples=1000, centers=centers, cluster_std= [0.5])

# train 3 different models
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_

k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(X)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_

k_means_4 = KMeans(init='k-means++', n_clusters=4, n_init=10)
k_means_4.fit(X)
k_means_4_labels = k_means_4.labels_
k_means_4_cluster_centers = k_means_4.cluster_centers_

figure(figsize=(10, 5))

bar(range(3),[k_means_2.inertia_, k_means_3.inertia_, k_means_4.inertia_])
xticks([0.5,1.5,2.5],('k=2','k=3','k=4'))
title("Inertia")


Out[78]:
<matplotlib.text.Text at 0x36ee62b0>

In [79]:
from sklearn import metrics

# Determine the silhouette scores
k_means_2_silhouette_score = metrics.silhouette_score(X, k_means_2_labels, metric='euclidean')
k_means_3_silhouette_score = metrics.silhouette_score(X, k_means_3_labels, metric='euclidean')
k_means_4_silhouette_score = metrics.silhouette_score(X, k_means_4_labels, metric='euclidean')

figure(figsize=(10, 5))

bar(range(3),[k_means_2_silhouette_score, k_means_3_silhouette_score, k_means_4_silhouette_score])
xticks([0.5,1.5,2.5],('k=2','k=3','k=4'))
title("Silhouette Coefficient")


Out[79]:
<matplotlib.text.Text at 0x38a2bcc0>
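
The silhouette coefficient of a single point is (b - a) / max(a, b), where a is its mean distance to the other points in its own cluster and b is its mean distance to the points in the nearest other cluster; the score plotted above is just the average over all points. A small sketch checking this with the per-sample values scikit-learn provides:

# Per-sample silhouette values; their mean equals the silhouette score used above
sample_silhouette_values = metrics.silhouette_samples(X, k_means_3_labels, metric='euclidean')
sample_silhouette_values.mean(), k_means_3_silhouette_score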

Summary so far

Good if the data is well described by centroids
Be careful if the data has a different scale in different dimensions
Remove irrelevant dimensions if possible
Measuring similarity in high dimensions can get difficult
Will not work well for non-isotropic data
Use "Inertia" or SSE to choose between different clustering results with the same number of clusters
Use the "Silhouette Coefficient" to choose the number of clusters

A different way to view the data


In [80]:
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)

sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)

figure(figsize=(10, 10))

imshow(similarity)


Out[80]:
<matplotlib.image.AxesImage at 0x38d38860>

In [81]:
figure(figsize=(12, 4))
# reorder according to class labels
ind_2 = np.argsort(k_means_2_labels)
subplot(1,3,1)
imshow(similarity[ind_2].T[ind_2])
title('K=2 \n' + str(k_means_2_silhouette_score))

ind_3 = np.argsort(k_means_3_labels)
subplot(1,3,2)
imshow(similarity[ind_3].T[ind_3])
title('K=3 \n' + str(k_means_3_silhouette_score))

ind_4 = np.argsort(k_means_4_labels)
subplot(1,3,3)
imshow(similarity[ind_4].T[ind_4])
title('K=4 \n' + str(k_means_4_silhouette_score))


Out[81]:
<matplotlib.text.Text at 0x38d47978>

Spectral clustering

Finds clusters by looking for connectivity between cluster members
Not restricted to similarity based on Euclidean distance
Works for non-isotropic data
Provides a way to estimate the number of clusters

1. Construct a similarity graph

2. Compute the graph Laplacian

3. Determine the eigenvectors f1, f2, ..., fk corresponding to the k smallest eigenvalues

4. Construct the [number of data points x number of clusters] matrix F

5. Use k-means in this new space to find the k clusters

Unnormalised spectral clustering

Constructing the similarity graph


In [82]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)

sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)

# Calculate the adjacency graph from the similarity matrix: first remove self-similarity (the diagonal)
A = (np.ones(similarity.shape)-np.eye(similarity.shape[0]))*similarity
# then sparsify by keeping only the n most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1] #reverse order
    A[i][A[i]<s[num_neighbours]] = 0
    
# k-nearest-neighbour graph - note this is a different k to the number of centres
# keep an edge if vertex i is one of the k nearest neighbours of j OR vertex j is one of the k nearest neighbours of i
A = np.fmax(A, A.T)

# OR

# Mutual k-nearest-neighbour graph
# keep an edge only if vertex i is one of the k nearest neighbours of j AND vertex j is one of the k nearest neighbours of i
#A = np.fmin(A, A.T)

Compute the graph Laplacian


In [83]:
# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))

# Un-normalised Laplacian Matrix
L = D - A

Determine the k smallest eigenvectors


In [84]:
# Perform an eigenvalue decomposition
from scipy.linalg import eig
[eig_vals, eig_vecs] = eig(L)

sorted_inds = np.argsort(eig_vals.real, axis=0)

The multiplicity of the zero (or near-zero) eigenvalues can give a strong clue as to the number of clusters


In [85]:
bar(range(10), eig_vals.real[sorted_inds[0:10]])


Out[85]:
<Container object of 10 artists>
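
A common way to read this plot automatically is the eigengap heuristic: look for the largest jump among the smallest eigenvalues and take the number of eigenvalues before that jump as the estimate of the number of clusters. This is a rule of thumb rather than a guarantee; a minimal sketch:

# Eigengap heuristic: index of the largest gap among the ten smallest eigenvalues
smallest_eig_vals = eig_vals.real[sorted_inds[0:10]]
estimated_k = np.argmax(np.diff(smallest_eig_vals)) + 1
estimated_k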

Construct the [ number of data points x number of clusters ] matrix F


In [86]:
F = eig_vecs[:,sorted_inds[0:3]]

Use k-means in this new space to find the k clusters


In [87]:
# Now perform k-means in this more amenable space
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(F)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_

ind_3 = np.argsort(k_means_3_labels)

imshow(similarity[ind_3].T[ind_3])


Out[87]:
<matplotlib.image.AxesImage at 0x3d101710>
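
The cells above can be collected into a single helper. This is a minimal sketch of the unnormalised pipeline, using the same Gaussian-kernel k-nearest-neighbour graph and the same illustrative choices of sig and num_neighbours as the cells above:

from scipy.linalg import eig

def unnormalised_spectral_clustering(X, n_clusters=3, num_neighbours=5, sig=0.5):
    # 1. Similarity graph: Gaussian kernel, diagonal removed, sparsified to a k-NN graph
    similarity = np.exp(-euclidean_distances(X, X, squared=True)/sig)
    A = similarity - np.diag(np.diag(similarity))
    for i in range(A.shape[0]):
        A[i][A[i] < np.sort(A[i])[::-1][num_neighbours]] = 0
    A = np.fmax(A, A.T)
    # 2. Unnormalised graph Laplacian
    L = np.diag(np.sum(A, axis=0)) - A
    # 3. Eigenvectors corresponding to the smallest eigenvalues
    eig_vals, eig_vecs = eig(L)
    sorted_inds = np.argsort(eig_vals.real)
    # 4. The [number of data points x number of clusters] matrix F
    F = eig_vecs[:, sorted_inds[0:n_clusters]].real
    # 5. K-means in the new space
    return KMeans(init='k-means++', n_clusters=n_clusters, n_init=10).fit(F).labels_

spectral_labels = unnormalised_spectral_clustering(X, n_clusters=3)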

Normalised spectral clustering (Symmetric version)


In [88]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)

sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)

# Calculate the adjacency graph from the similarity matrix: first remove self-similarity (the diagonal)
A = (np.ones(similarity.shape)-np.eye(similarity.shape[0]))*similarity
# then sparsify by keeping only the n most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1] #reverse order
    A[i][A[i]<s[num_neighbours]] = 0
    
# k-nearest-neighbour graph - note this is a different k to the number of centres
# keep an edge if vertex i is one of the k nearest neighbours of j OR vertex j is one of the k nearest neighbours of i
A = np.fmax(A, A.T)

# OR

# Mutual k-nearest-neighbour graph
# keep an edge only if vertex i is one of the k nearest neighbours of j AND vertex j is one of the k nearest neighbours of i
#A = np.fmin(A, A.T)

# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))

Up to this point the procedure is the same as for the unnormalised version


In [89]:
# Unnormalised Laplacian
L = np.matrix(D - A)
# Calculate the inverse square root of D, i.e. D^(-0.5)
D_minus_sqrt = np.matrix(np.diag(1/sqrt(np.sum(A, axis=0))))
# Normalised Laplacian Matrix
Lsym = D_minus_sqrt * L * D_minus_sqrt


[eig_vals, eig_vecs] = eig(Lsym)
sorted_inds = np.argsort(eig_vals.real, axis=0)

F = eig_vecs[:,sorted_inds[0:3]]

Normalise the rows of F to be of unit length


In [90]:
import sklearn.preprocessing as preprocessing
Fnorm = preprocessing.normalize(F, norm='l2')

In [91]:
# Now perform k-means in this more amenable space
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(Fnorm)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_

ind_3 = np.argsort(k_means_3_labels)

imshow(similarity[ind_3].T[ind_3])


Out[91]:
<matplotlib.image.AxesImage at 0x3d2a8c88>

Normalised spectral clustering (Random walk version)


In [92]:
################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)

sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)

# Calculate the adjacency graph from the similarity matrix: first remove self-similarity (the diagonal)
A = (np.ones(similarity.shape)-np.eye(similarity.shape[0]))*similarity
# then sparsify by keeping only the n most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1] #reverse order
    A[i][A[i]<s[num_neighbours]] = 0
    
# k-nearest-neighbour graph - note this is a different k to the number of centres
# keep an edge if vertex i is one of the k nearest neighbours of j OR vertex j is one of the k nearest neighbours of i
A = np.fmax(A, A.T)

# OR

# Mutual k-nearest-neighbour graph
# keep an edge only if vertex i is one of the k nearest neighbours of j AND vertex j is one of the k nearest neighbours of i
#A = np.fmin(A, A.T)

# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))

# Unnormalised Laplacian
L = np.matrix(D - A)

Now perform a generalised eigenvalue decomposition, solving Lf = λDf


In [93]:
[eig_vals, eig_vecs] = eig(L, b = D)

sorted_inds = np.argsort(eig_vals.real, axis=0)

Construct F and perform k-means


In [94]:
F = eig_vecs[:,sorted_inds[0:3]]
# Now perform k-means in this more amenable space
k_means_3 = KMeans(init='k-means++', n_clusters=3, n_init=10)
k_means_3.fit(F)
k_means_3_labels = k_means_3.labels_
k_means_3_cluster_centers = k_means_3.cluster_centers_

ind_3 = np.argsort(k_means_3_labels)

imshow(similarity[ind_3].T[ind_3])


Out[94]:
<matplotlib.image.AxesImage at 0x3d349438>

In [113]:
[X, true_labels] = make_moons(n_samples=1000, noise=.05)

################################################################################
# Construct the similarity graph
# Calculate the squared distance between each point
distance = euclidean_distances(X,X,squared=True)

sig = 0.5
# Turn the distance into a similarity by applying a Gaussian kernel
similarity = np.exp(-distance/sig)

# Calculate the adjacency graph from the similarity matrix: first remove self-similarity (the diagonal)
A = (np.ones(similarity.shape)-np.eye(similarity.shape[0]))*similarity
# then sparsify by keeping only the n most similar entries in each row
num_neighbours = 5
for i in range(A.shape[0]):
    s = np.sort(A[i])
    s = s[::-1] #reverse order
    A[i][A[i]<s[num_neighbours]] = 0
    
# k-nearest-neighbour graph - note this is a different k to the number of centres
# keep an edge if vertex i is one of the k nearest neighbours of j OR vertex j is one of the k nearest neighbours of i
A = np.fmax(A, A.T)

# OR

# Mutual k-nearest-neighbour graph
# keep an edge only if vertex i is one of the k nearest neighbours of j AND vertex j is one of the k nearest neighbours of i
#A = np.fmin(A, A.T)

# Calculate the degree - the sum of all incoming weights to a vertex
D = np.diag(np.sum(A, axis=0))

# Un-normalised Laplacian Matrix
L = D - A

# Perform an eigenvalue decomposition
from scipy.linalg import eig
[eig_vals, eig_vecs] = eig(L)

sorted_inds = np.argsort(eig_vals.real, axis=0)

bar(range(10), eig_vals.real[sorted_inds[0:10]])


Out[113]:
<Container object of 10 artists>

In [114]:
F = eig_vecs[:,sorted_inds[0:2]]
# Now perform k-means in this more amenable space
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(F)
k_means_2_labels = k_means_2.labels_
k_means_2_cluster_centers = k_means_2.cluster_centers_

ind_2 = np.argsort(k_means_2_labels)

imshow(similarity[ind_2].T[ind_2])


Out[114]:
<matplotlib.image.AxesImage at 0x3d355a58>

In [116]:
for k, col in zip(range(2), colors):
    my_members = k_means_2_labels == k
    scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    # Note: the cluster centres found by k-means live in the eigenvector space F,
    # not in the original coordinates, so they are not plotted here
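
scikit-learn also provides a ready-made implementation of spectral clustering. A minimal sketch on the two-moons data; the nearest-neighbours affinity roughly mirrors the k-NN similarity graph built by hand above, and n_neighbors=10 is an illustrative, untuned choice:

from sklearn.cluster import SpectralClustering

spectral = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_neighbors=10)
spectral_labels = spectral.fit_predict(X)

figure(figsize=(10, 10))
for k, col in zip(range(2), colors):
    scatter(X[spectral_labels == k, 0], X[spectral_labels == k, 1], c=col, marker='o', s=20)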



In [ ]: